## -*- mode: Awk; -*-  vim: set filetype=awk : 

#bias

#Given a frequency counts of a population, generate numbers biased by those frequencies.

#For demo of this code, see [?biasDemo biasDemo].

#When not to use this code
#=========================

#If you sampling from small populations (say, two dozen or less), 
#just sample the population at the rate you want, throw those samples into an
#array, then select at item of that array at random.

#E.g. suppose you just want to pick at random from a set of quotes:

#  Quotes = "Small  things with great love. <br>-- Mother Teresa\n\
#            It's hard work to it look effortless.<br>-- Katarina Witt\n\
#            God bless us every one!.<br>-- Tiny Tim"
#   srand(systime() + PROCINFO["pid"])
#   n=split(Quotes,tmp,"\n")
#   print  tmp[int(rand()*n) + 1]

#Use the following code if you are sampling from larger populations with
#complex distributions.

#Code
#==== 

#Uses
#----

#@uses o math s2a

#Top-level Drivers
#-----------------

#sample
#++++++

#Sample from population of things.

 function sample(population,cdf,memo,   count,one) { 
     for(one in population) 
         count[population[one]]++
     return sampleCounts(count,cdf,memo)
 }

#sampleCounts
#++++++++++++

#Sample of a list of counted things.

 function sampleCounts(counts,cdf,memo,   all, one,x,f,i) {
     for(one in counts)
         all += counts[one]
     for(one in counts) {
         x = (1000*counts[one]/all)+ rand()/100
         f[++i]= x
         memo[x] = one
     }
     a2cdf(f,cdf)
     return i
 }

#Workers
#-------

#a2cdf
#+++++

#Prepare an array of numbers, prior to picking. 
#Internally, this code 
#converts an array of numbers into a cdf. From
#[http://www.aiaccess.net/English/Glossaries/GlosMod/e_gm_distribution_function.htm aiaccess.net]: 

#* A _cdf_ (or "cumulative distribtion function") is the proportion of the population whose value is less than x.  The _cdf_ of
#  a random variable is clearly a monotonously increasing (or more precisely, non decreasing) function from 0 to 1.

#In Awk, we can represent this as an array of  "_n_" symbols stored at
#locations 1,2,3..n and a set of increasing probabilities  locations
#-1,-2,-3,...-n (where -i is the probability of symbol i). 

#    This kind of array is generated by `a2cdf`:

 function a2cdf(a,cdf, sorted,n) {
     n= a2best(a,sorted)
     return sorted2cdf(n,sorted,cdf)
 }

#a2best
#++++++

#If "_a_" is an array of numbers,  then sort them in ascending order.
#If _max_ is supplied, return no more than the first _max_ number of values. 

 function a2best(a,best,  max,    sorted,n,i,j) {
                #must     opt     local
     n = asort(a,sorted);
     if (max && n >= max) {
         for(i = n - max + 1; i<=n; i++)
             best[++j] = sorted[i];
         n= max;
     } 
     else 
         for(i in sorted)
             best[i]=sorted[i];
     return n;
 }

#sorted2cdf
#++++++++++

 function sorted2cdf(n,sorted,cdf,    i,j,sum) {
     if (! (1 in sorted)) 
         return (cdf[0]=split("",cdf,""))
     sum = cdf[1] = sorted[1];
     for(i=2;i<=n;i++) { 
         sum += sorted[i];
         cdf[i] = cdf[i-1] + sorted[i];
     }
     for(i in cdf)  
         cdf[i] = cdf[i] / sum;
     for(i=1;i<=n;i++)
         cdf[-1*i] = sorted[i]
     cdf[0]=n
     return n
 }

#Pick
#++++

#Return one symbol from a _cdf_, biased by the _cdf_ probabilities. 
#If _skew_ is provided, bias the selection towards one end of the
#distribtion. The larger the _skew_ (above one), the more we favor
#things with higher probabilities. It _skew_ is one, the just
#draw from the supplied distribution.

 function pick(cdf,  skew,   i) {
               #must   opt     local
     i = pick1(cdf,cdf[0],skew)
     return cdf[-1*i]
 }
 function pick1(cdf,n,  skew,u,i,here) {
     skew = skew ? skew : 1
     u = rand()^skew 
     for(i=n-1;i>=1;i--) 
         if (u > cdf[i] ) 
             return i+1;
     return 1;
 }

#Author
#======

#Tim Menzies

